Code
import pandas as pd
import altair as alt
import seaborn as sns
import plotly.express as px
from vega_datasets import data
import matplotlib.pyplot as plt

Amelia Baier, Andrea Dukic, Mia Mayerhofer
To provide an overview of the data, we will be looking at the data from a geographic perspective, specifically at the state level.
# Compute the per-state mean of every numeric feature for the choropleth map.
num_cols = df[['State', 'GPA', 'WorkExp', 'TestScore', 'WritingScore', 'VolunteerLevel']]
avg_df = num_cols.groupby('State').mean().reset_index()

# Plotly's "USA-states" location mode requires two-letter postal codes,
# so map the full state names onto their abbreviations.
state_abbr = {
    'Alabama': 'AL',
    'California': 'CA',
    'Colorado': 'CO',
    'Florida': 'FL',
    'Georgia': 'GA',
    'Mississippi': 'MS',
    'New York': 'NY',
    'Oregon': 'OR',
    'Utah': 'UT',
    'Vermont': 'VT',
    'Virginia': 'VA',
}
avg_df['State_Abbr'] = avg_df['State'].map(state_abbr)
avg_df = avg_df.drop(columns=['State'])
avg_df = avg_df.rename(columns={'State_Abbr': 'State'})

import plotly.io as pio

pio.renderers.default = "plotly_mimetype+notebook"

# Choropleth of the selected average feature by state; GPA is the initial view.
# NOTE(review): custom_palette is defined elsewhere in the notebook — confirm it
# is a valid continuous color scale before running this cell in isolation.
fig = px.choropleth(
    avg_df,
    locationmode="USA-states",
    locations="State",  # column names (not Series) keep hover_data consistent
    scope="usa",
    color="GPA",
    hover_data={"State": True, "GPA": True},
    labels={"GPA": "Selected Variable"},
    color_continuous_scale=custom_palette,
)

# Dropdown buttons to switch which feature is mapped. avg_df.columns[:-1]
# skips the trailing 'State' column, leaving only the numeric features.
dropdown = []
for col in avg_df.columns[:-1]:
    dropdown.append({'label': col, 'method': 'update', 'args': [{'z': [avg_df[col]]}]})
fig.update_layout(
    updatemenus=[{'buttons': dropdown, 'direction': 'down', 'showactive': True}],
    title='Choropleth Map of Average Selected Variable',
)
fig.update_coloraxes(colorbar_title=dict(text='Selected Variable'))
fig.show()

Above is a choropleth map of the average numeric features (GPA, test score, writing score, work experience in years, and volunteer level) by state. The averages of the numeric features are calculated across all decision types to obtain a holistic view of the student data by state. Below we summarize some findings for each feature:
GPA:
Work Experience:
Test Score:
Writing Score:
Volunteer Level:
| GPA | Test Score | Writing Score |
|---|---|---|
| California has the highest average GPA, with Florida and New York close behind. | California has the highest average test score. | California has the highest average writing score. |
| Oregon and Mississippi have the lowest average GPA. | Mississippi has the lowest average test score. | New York has the lowest average writing score. |
| Work Experience | Volunteer Level | |
|---|---|---|
| Mississippi has the highest average work experience in years. | Oregon has the highest average volunteer level. | |
| Oregon has the lowest average work experience. | Alabama has the lowest average volunteer level. | |
# Build per-state admission/rejection rates: count applicants for each
# (Decision, State) pair and express the count as a percentage of that
# decision's overall total.
decision_state = df.groupby(['Decision', 'State'])[["GPA"]].count().reset_index()
decision_state = decision_state.rename(columns={'GPA': 'StateCount'})
decision_state['DecisionCount'] = decision_state.groupby('Decision')['StateCount'].transform('sum')
decision_state['Rate'] = decision_state['StateCount'] / decision_state['DecisionCount'] * 100

# Map state names to the numeric ids used by the us-10m topojson.
# Fetch the lookup dataset once instead of downloading it twice.
_state_lookup = data.population_engineers_hurricanes()
state_id_dict = dict(zip(_state_lookup["state"], _state_lookup["id"]))
decision_state["StateID"] = decision_state["State"].map(state_id_dict)

admit_states = decision_state[decision_state['Decision'] == "Admit"]
decline_states = decision_state[decision_state['Decision'] == "Decline"]

# TopoJSON of US state boundaries for the Altair maps below.
states = alt.topo_feature('https://raw.githubusercontent.com/vega/vega-datasets/master/data/us-10m.json', 'states')
# Shared click selection so highlighting stays in sync across both maps.
# NOTE(review): selection_multi/add_selection are the Altair <5 APIs (renamed
# selection_point/add_params in Altair 5) — confirm the pinned Altair version.
click = alt.selection_multi(fields=["State"])


def _decision_map(rate_df, title):
    """Choropleth of per-state rates, with a grey overlay for states absent from rate_df.

    rate_df must carry 'State', 'Rate', and 'StateID' columns; the lookup joins
    it onto the us-10m topojson features by their numeric id.
    """
    # Colored layer: states present in rate_df at full opacity, others faded.
    filled = alt.Chart(states).mark_geoshape(stroke='black').encode(
        color=alt.Color("Rate:Q", scale=alt.Scale(range=custom_palette)),
        tooltip=["State:N", "Rate:Q"],
        opacity=alt.condition('isValid(datum.Rate)', alt.value(1), alt.value(0.2)),
    ).transform_lookup(
        lookup="id",
        from_=alt.LookupData(rate_df, "StateID", list(rate_df.columns)),
    ).properties(
        width=333, height=200, title=title,
    ).add_selection(click).project(type="albersUsa").interactive()

    # Grey layer shown only where no rate data exists.
    missing = alt.Chart(states).mark_geoshape(fill="grey", stroke="white").encode(
        opacity=alt.condition("isValid(datum.Rate)", alt.value(0), alt.value(0.2)),
    ).add_selection(click).project(type="albersUsa")

    # Re-apply tooltip/lookup on the layered chart, matching the original cells.
    return (filled + missing).encode(
        tooltip=["State:N", "Rate:Q"],
    ).transform_lookup(
        lookup="id",
        from_=alt.LookupData(rate_df, "StateID", list(rate_df.columns)),
    ).interactive()


# The admit and decline maps were previously built from two near-identical
# copy-pasted blocks; the helper removes that duplication.
admit_map = _decision_map(admit_states, "Admission Rates by State")
decline_map = _decision_map(decline_states, "Rejection Rates by State")
admit_map | decline_map

Before applying machine learning, we will explain the reasoning behind applying ML to the student admissions data. We wanted to identify relationships between the academic features in the data — namely GPA, writing score, and test score — and the decision. The motivation is to use the information about these relationships, if any are present, to help students at our university understand which features of their application might contribute to the decision. We then hope to provide targeted help to our students based on our findings, to increase the number of students admitted to internships.
Above is the pairplot of GPA, writing score, and test score of the students grouped by the decision. When looking at the scatterplots, we notice some patterns:
Through the pairplot we can see that some of the academic features have relationships by decision result, but some features seem to be more important than others. We will investigate the importance of all features on decision using XGBoost and Shapley values.